Considering Outliers and Novelty Detection


In [ ]:
import numpy as np
from scipy.stats import pearsonr

np.random.seed(101)
# Draw 1,000 observations from a standard normal distribution
normal = np.random.normal(loc=0.0, scale=1.0, size=1000)
print('Mean: %0.3f Median: %0.3f Variance: %0.3f' % (np.mean(normal), np.median(normal), np.var(normal)))

# Inject a single extreme value and observe how the summary statistics react
outlying = normal.copy()
outlying[0] = 50.0
print('Mean: %0.3f Median: %0.3f Variance: %0.3f' % (np.mean(outlying), np.median(outlying), np.var(outlying)))

r, p = pearsonr(normal, outlying)
print("Pearson's correlation coefficient: %0.3f p-value: %0.3f" % (r, p))
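
Because a single extreme value is enough to distort the mean and the variance, it helps to pair the robust median with an equally robust measure of spread. The following cell is a complementary sketch (not part of the original example) that computes the median absolute deviation on the two arrays defined above; unlike the variance, it barely reacts to the injected outlier.

In [ ]:
# Median absolute deviation (MAD): a robust counterpart to the standard deviation.
# The 1.4826 factor (an assumed convention) rescales the MAD so that it estimates
# the standard deviation when the data are normally distributed.
def mad(x):
    return 1.4826 * np.median(np.abs(x - np.median(x)))

print('MAD without the outlier: %0.3f' % mad(normal))
print('MAD with the outlier:    %0.3f' % mad(outlying))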

Finding more things that can go wrong with your data

Understanding the difference between anomalies and novel data
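
In short, outlier (anomaly) detection looks for contaminated observations inside the data you already have, whereas novelty detection assumes the training data is clean and asks whether new, incoming observations conform to it. The following cell is an illustrative sketch of that distinction (not part of the original chapter code), using two scikit-learn estimators: EllipticEnvelope for in-sample outlier detection and OneClassSVM, which reappears at the end of this section, for novelty detection.

In [ ]:
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM

np.random.seed(101)
train = np.random.normal(size=(200, 2))       # data in hand, assumed mostly clean
new = np.array([[0.1, -0.2], [6.0, 6.0]])     # fresh observations to screen

# Outlier detection: fit and flag anomalies within the same sample
inlier_flags = EllipticEnvelope(contamination=0.05).fit_predict(train)

# Novelty detection: fit on the clean sample, then judge unseen points
novelty_flags = OneClassSVM(nu=0.05, gamma='scale').fit(train).predict(new)

print(inlier_flags[:10])   # -1 marks an outlier, +1 an inlier
print(novelty_flags)       # the distant new point should come out as -1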

Examining a Fast and Simple Univariate Method


In [2]:
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()

The diabetes dataset contains 442 samples described by 10 real-valued features (each scaled to the range -.2 < x < .2) and an integer target ranging from 25 to 346.


In [3]:
X,y = diabetes.data, diabetes.target

In [4]:
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format
df = pd.DataFrame(X)
print(df.describe())


           0      1      2      3      4      5      6      7      8      9
count 442.00 442.00 442.00 442.00 442.00 442.00 442.00 442.00 442.00 442.00
mean   -0.00   0.00  -0.00   0.00  -0.00   0.00  -0.00   0.00  -0.00  -0.00
std     0.05   0.05   0.05   0.05   0.05   0.05   0.05   0.05   0.05   0.05
min    -0.11  -0.04  -0.09  -0.11  -0.13  -0.12  -0.10  -0.08  -0.13  -0.14
25%    -0.04  -0.04  -0.03  -0.04  -0.03  -0.03  -0.04  -0.04  -0.03  -0.03
50%     0.01  -0.04  -0.01  -0.01  -0.00  -0.00  -0.01  -0.00  -0.00  -0.00
75%     0.04   0.05   0.03   0.04   0.03   0.03   0.03   0.03   0.03   0.03
max     0.11   0.05   0.17   0.13   0.15   0.20   0.18   0.19   0.13   0.14

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import pylab as pl
box_plots = df.boxplot(return_type='dict')
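
The boxplot flags as suspect any point lying more than 1.5 times the interquartile range beyond the first or third quartile. If you prefer numbers to a picture, the following cell (an added illustration, reusing the df defined above) reproduces that rule and counts the flagged values per feature.

In [ ]:
# Tukey's boxplot rule: values beyond Q1 - 1.5*IQR or Q3 + 1.5*IQR are suspects
q1, q3 = df.quantile(0.25), df.quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
suspects = (df < lower) | (df > upper)
print(suspects.sum())   # number of flagged values for each feature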


Leveraging the Gaussian distribution


In [6]:
from sklearn.preprocessing import StandardScaler
# Standardize the features, then flag any absolute z-score above 3
Xs = StandardScaler().fit_transform(X)
o_idx = np.where(np.abs(Xs) > 3)   # (row, column) positions of the outlying values
# .any(axis=1) reports each suspect row only once, even if several features exceed the threshold
print(df[(np.abs(Xs) > 3).any(axis=1)])


        0     1     2     3     4     5     6     7     8     9
58   0.04 -0.04 -0.06  0.04  0.01 -0.06  0.18 -0.08 -0.00 -0.05
123  0.01  0.05  0.03 -0.00  0.15  0.20 -0.06  0.19  0.02  0.07
216  0.01  0.05  0.04  0.05  0.05  0.07 -0.07  0.15  0.05  0.05
230 -0.04  0.05  0.07 -0.06  0.15  0.16  0.00  0.07  0.05  0.07
256 -0.05 -0.04  0.16 -0.05 -0.03 -0.02 -0.05  0.03  0.03  0.01
260  0.04 -0.04 -0.01 -0.06  0.01 -0.03  0.15 -0.08 -0.08 -0.02
261  0.05 -0.04 -0.04  0.10  0.04 -0.03  0.18 -0.08 -0.01  0.02
269  0.01 -0.04 -0.03 -0.03  0.04 -0.01  0.16 -0.08 -0.01 -0.04
322  0.02  0.05  0.06  0.06  0.02 -0.04 -0.09  0.16  0.13  0.08
336 -0.02 -0.04  0.09 -0.04  0.09  0.09 -0.06  0.15  0.08  0.05
367 -0.01  0.05  0.17  0.01  0.03  0.03 -0.02  0.03  0.03  0.03
441 -0.05 -0.04 -0.07 -0.08  0.08  0.03  0.17 -0.04 -0.00  0.00

Making assumptions and checking out


In [7]:
from scipy.stats.mstats import winsorize
# Winsorizing: pull the lowest and highest 5% of values back to the 5th and 95th percentiles
Xs_w = winsorize(Xs, limits=(0.05, 0.05))

# Capping: clip the previously flagged values (|z| > 3) back to +/-3 standard deviations
Xs_c = Xs.copy()
Xs_c[o_idx] = np.sign(Xs_c[o_idx]) * 3
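
To confirm that the two remedies behaved as intended, it is worth comparing the spread of the data before and after treatment. The quick check below is an added illustration, reusing Xs, Xs_w, and Xs_c from the previous cell.

In [ ]:
# After treatment the extreme minima and maxima should shrink toward the bulk of the data
for name, data in (('original', Xs), ('winsorized', Xs_w), ('capped', Xs_c)):
    print('%-10s min: %6.2f max: %6.2f std: %5.2f' %
          (name, np.min(data), np.max(data), np.std(data)))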

Developing a Multivariate Approach

Using principal component analysis


In [19]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from pandas.plotting import scatter_matrix

# Project the standardized data onto its principal components
dim_reduction = PCA()
Xc = dim_reduction.fit_transform(scale(X))

print('variance explained by the first 2 components: %0.1f%%' % sum(dim_reduction.explained_variance_ratio_[:2] * 100))
print('variance explained by the last 2 components: %0.1f%%' % sum(dim_reduction.explained_variance_ratio_[-2:] * 100))

# Plot the most and the least informative pairs of components
df = pd.DataFrame(Xc, columns=['comp_' + str(j) for j in range(10)])
first_two = df.plot(kind='scatter', x='comp_0', y='comp_1', c='DarkGray', s=50)
last_two  = df.plot(kind='scatter', x='comp_8', y='comp_9', c='DarkGray', s=50)


variance explained by the first 2 components: 55.2%
variance explained by the last 2 components: 0.9%

In [9]:
print('variance explained by the first 3 components: %0.1f%%' % sum(dim_reduction.explained_variance_ratio_[:3] * 100))
scatter_first = scatter_matrix(pd.DataFrame(Xc[:,:3], columns=['comp1','comp2','comp3']), 
                         alpha=0.3, figsize=(15, 15), diagonal='kde',  marker='o', grid=True)

scatter_last = scatter_matrix(pd.DataFrame(Xc[:,-2:], columns=['comp9','comp10']), 
                         alpha=0.3, figsize=(15, 15), diagonal='kde',  marker='o', grid=True)


variance explained by the first 3 components: 67.2%

In [10]:
# Thresholds chosen by visually inspecting the scatter plot of the last two components
outlying = (Xc[:,-1] < -0.3) | (Xc[:,-2] < -1.0)
print(df[outlying])


     comp_0  comp_1  comp_2  comp_3  comp_4  comp_5  comp_6  comp_7  comp_8  \
23    -3.77   -1.76    1.09    0.72   -0.64   -1.90    0.56    1.09   -0.44   
58     2.65    2.23    2.79   -0.63    0.26    0.13    1.44    0.67   -1.01   
110    2.04   -0.76    0.74   -1.93   -0.07   -0.24   -1.75   -0.41   -0.47   
169   -2.35    0.15   -0.13    1.19   -0.64   -0.64    2.65   -0.31   -0.22   
254   -3.82   -1.03    1.06    0.44    0.27   -0.86    0.97    0.66   -0.43   
322   -4.52   -2.24   -0.14    0.85   -0.47   -0.73    1.28    0.34   -1.39   
323   -3.87   -0.69    0.26   -0.58   -0.97   -0.76    1.79    0.36   -0.69   
353   -0.98    1.61   -1.16    1.14   -0.36   -1.46    2.53    0.90    0.02   
371   -2.11   -0.28    0.64   -0.65   -0.36    0.26    2.22    1.09   -0.07   
394   -2.24   -1.13    0.51    1.54   -1.30    0.12    2.28   -0.10   -0.40   

     comp_9  
23    -0.50  
58     0.21  
110   -0.31  
169   -0.50  
254   -0.33  
322   -0.38  
323   -0.40  
353   -0.50  
371   -0.35  
394   -0.43  
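
The cutoffs above come from eyeballing the scatter plots. If you prefer a rule that does not depend on visual inspection, one option (an added sketch, not the chapter's own procedure) is to express the last two components in standard-deviation units and flag any row that lies more than three standard deviations from their center.

In [ ]:
# Score the last two components as z-scores and flag extreme rows
last_two_z = scale(Xc[:, -2:])
outlying_z = (np.abs(last_two_z) > 3).any(axis=1)
print(df[outlying_z])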

Using cluster analysis


In [21]:
from sklearn.cluster import DBSCAN
from collections import Counter

# DBSCAN labels points that do not belong to any dense cluster as noise (label -1)
DB = DBSCAN(eps=2.5, min_samples=25)
DB.fit(Xc)

print(Counter(DB.labels_))
print(df[DB.labels_ == -1])


Counter({0: 414, -1: 28})
     comp_0  comp_1  comp_2  comp_3  comp_4  comp_5  comp_6  comp_7  comp_8  \
15    -2.95    0.95   -1.98   -0.79    1.77    1.05    1.38   -0.88   -0.35   
23    -3.77   -1.76    1.09    0.72   -0.64   -1.90    0.56    1.09   -0.44   
29     0.36   -1.41    3.27   -0.67    0.05   -0.97    0.96    1.10    0.17   
35     0.70    1.88    1.36   -1.58    0.02   -1.12   -0.04    2.12   -0.32   
78     1.57   -1.29   -0.66    0.50   -1.64    0.64    1.98   -0.23    0.22   
117   -3.49    0.19    1.29    0.46   -0.80   -1.71    0.84   -1.77   -0.09   
123   -5.67    2.16   -2.83   -0.98    0.43   -0.22   -1.25   -0.85   -0.70   
141   -3.31   -0.02   -1.76    1.79   -0.94   -1.92   -1.10   -0.77   -0.64   
161   -5.13    1.23   -1.13    0.32    1.91   -0.54   -0.07   -0.15   -0.00   
169   -2.35    0.15   -0.13    1.19   -0.64   -0.64    2.65   -0.31   -0.22   
230   -3.93    2.59   -2.08    0.03    1.03   -0.84   -1.40    1.38    0.34   
248   -4.05    2.23   -1.18    1.33    1.02    0.14    0.35   -0.92   -0.23   
251   -4.15   -0.74   -0.73    1.13    1.74    0.38    0.44    0.21   -0.53   
261    1.30    2.40    3.81   -0.57    1.13   -0.22    0.86   -0.27   -0.80   
276   -1.90    3.18    0.79    0.45    0.89    0.93    0.86    0.20    0.36   
321   -4.64   -0.12    0.59    0.91   -1.64    0.36    0.82   -1.09   -0.90   
322   -4.52   -2.24   -0.14    0.85   -0.47   -0.73    1.28    0.34   -1.39   
323   -3.87   -0.69    0.26   -0.58   -0.97   -0.76    1.79    0.36   -0.69   
336   -4.07    1.13   -2.09    1.95   -0.71   -0.26   -0.43    0.15   -0.69   
349    1.53   -2.64   -1.94   -1.33   -1.07    0.79    0.11   -0.77   -0.19   
352    3.37   -0.12    1.20   -1.54    0.04   -0.72    0.64    1.65   -0.74   
353   -0.98    1.61   -1.16    1.14   -0.36   -1.46    2.53    0.90    0.02   
367   -2.72   -0.69    0.10    1.26    1.01    0.94   -1.66    1.60   -0.07   
376   -3.33    2.43   -1.49    1.05   -0.09    0.89   -0.39   -0.05    0.06   
394   -2.24   -1.13    0.51    1.54   -1.30    0.12    2.28   -0.10   -0.40   
405   -0.66   -3.94    1.79    0.87   -0.13    1.97    0.09    0.77   -0.14   
422   -1.76    0.95   -0.32    0.58    2.66    0.66    0.04    1.12    0.40   
441    1.90    3.98   -0.05   -0.22    0.60   -1.65    0.25    1.19   -0.73   

     comp_9  
15     0.09  
23    -0.50  
29     0.03  
35     0.11  
78     0.09  
117   -0.04  
123    0.09  
141    0.12  
161   -0.08  
169   -0.50  
230   -0.05  
248    0.00  
251   -0.05  
261    0.15  
276   -0.04  
321   -0.09  
322   -0.38  
323   -0.40  
336   -0.01  
349   -0.04  
352    0.15  
353   -0.50  
367    0.03  
376    0.00  
394   -0.43  
405    0.08  
422   -0.01  
441    0.15  
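
How many points DBSCAN marks as noise depends heavily on eps (the neighborhood radius) and min_samples. Before trusting the 28 outliers reported above, it can help to check how sensitive that count is to the radius; the sweep below is an added sketch of such a check.

In [ ]:
# Count how many observations end up labeled as noise (-1) for a range of radii
for eps in (1.5, 2.0, 2.5, 3.0, 3.5):
    labels = DBSCAN(eps=eps, min_samples=25).fit(Xc).labels_
    print('eps=%.1f -> %d noise points' % (eps, np.sum(labels == -1)))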

Automating outlier detection with SVM


In [22]:
from sklearn import svm

# Expected share of outliers in the data; nu (an upper bound on the fraction of
# training errors and a lower bound on the fraction of support vectors) is
# derived from it with a simple heuristic
outliers_fraction = 0.01
nu_estimate = 0.95 * outliers_fraction + 0.05

auto_detection = svm.OneClassSVM(kernel="rbf", gamma=0.01, nu=nu_estimate)
auto_detection.fit(Xc)

# predict() returns +1 for inliers and -1 for detected outliers
evaluation = auto_detection.predict(Xc)
print(df[evaluation == -1])


     comp_0  comp_1  comp_2  comp_3  comp_4  comp_5  comp_6  comp_7  comp_8  \
10     3.86   -1.53   -0.23    0.66    0.73   -0.53    0.82   -1.42    0.43   
23    -3.77   -1.76    1.09    0.72   -0.64   -1.90    0.56    1.09   -0.44   
32    -2.61   -2.94   -0.39    0.64   -0.48    1.15   -1.38   -0.07   -0.99   
58     2.65    2.23    2.79   -0.63    0.26    0.13    1.44    0.67   -1.01   
76     1.32   -3.81    1.20    0.31    1.01    0.11   -0.11   -0.46    0.24   
84     3.57    0.97   -0.98   -0.30   -1.15    1.36   -0.13    0.62   -0.11   
123   -5.67    2.16   -2.83   -0.98    0.43   -0.22   -1.25   -0.85   -0.70   
141   -3.31   -0.02   -1.76    1.79   -0.94   -1.92   -1.10   -0.77   -0.64   
161   -5.13    1.23   -1.13    0.32    1.91   -0.54   -0.07   -0.15   -0.00   
166    3.41   -1.76   -1.29   -1.08    0.49    0.85    0.34    0.05   -0.09   
187    4.51   -0.28   -0.22    0.29    0.48    0.82    0.40   -0.33   -0.24   
202   -3.03    2.11    0.25   -1.55   -0.14    0.97    1.14    1.03    0.34   
230   -3.93    2.59   -2.08    0.03    1.03   -0.84   -1.40    1.38    0.34   
248   -4.05    2.23   -1.18    1.33    1.02    0.14    0.35   -0.92   -0.23   
260    2.89    2.19    1.68   -0.42   -0.36   -0.04   -1.26    0.88   -0.82   
261    1.30    2.40    3.81   -0.57    1.13   -0.22    0.86   -0.27   -0.80   
286    3.74    1.13    0.90    0.43   -0.04   -1.50    0.23    1.25   -0.84   
321   -4.64   -0.12    0.59    0.91   -1.64    0.36    0.82   -1.09   -0.90   
322   -4.52   -2.24   -0.14    0.85   -0.47   -0.73    1.28    0.34   -1.39   
336   -4.07    1.13   -2.09    1.95   -0.71   -0.26   -0.43    0.15   -0.69   
349    1.53   -2.64   -1.94   -1.33   -1.07    0.79    0.11   -0.77   -0.19   
405   -0.66   -3.94    1.79    0.87   -0.13    1.97    0.09    0.77   -0.14   
406    3.91    0.71   -1.71   -0.12   -0.43    0.94    0.83    0.13   -0.15   
425    4.15   -1.18   -0.48    1.21   -0.34   -0.62    0.52    0.65    0.07   
441    1.90    3.98   -0.05   -0.22    0.60   -1.65    0.25    1.19   -0.73   

     comp_9  
10    -0.09  
23    -0.50  
32     0.15  
58     0.21  
76    -0.00  
84    -0.09  
123    0.09  
141    0.12  
161   -0.08  
166   -0.11  
187   -0.12  
202   -0.15  
230   -0.05  
248    0.00  
260   -0.07  
261    0.15  
286    0.17  
321   -0.09  
322   -0.38  
336   -0.01  
349   -0.04  
405    0.08  
406   -0.02  
425    0.07  
441    0.15  
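
predict() gives only a hard in/out verdict. If you also want to rank the flagged observations by how anomalous the model considers them, OneClassSVM exposes a decision_function whose values are negative for points outside the learned boundary. The snippet below is an added illustration of using it with the fitted auto_detection model.

In [ ]:
# Signed distance to the separating boundary: the more negative, the more anomalous
scores = auto_detection.decision_function(Xc).ravel()
ranking = np.argsort(scores)          # most anomalous observations first
print(df.iloc[ranking[:5]])           # the five most extreme rows
print(scores[ranking[:5]])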

In [13]:
inliers  = Xc[evaluation==+1,:]
outliers = Xc[evaluation==-1,:]

from matplotlib import pyplot as plt
import pylab as pl

inlying  = plt.plot(inliers[:,0],inliers[:,1], 'o', markersize=2, color='g', alpha=1.0, label='inliers')
outlying = plt.plot(outliers[:,0],outliers[:,1], 'o', markersize=5, color='k', alpha=1.0, label='outliers')
plt.scatter(outliers[:,0],
           outliers[:,1],
           s=100, edgecolors="k", facecolors="none")
plt.xlabel('Component 1 ('+str(round(dim_reduction.explained_variance_ratio_[0],3))+')')
plt.ylabel('Component 2 ('+str(round(dim_reduction.explained_variance_ratio_[1],3))+')')
plt.xlim([-7,7])
plt.ylim([-6,6])
plt.legend((inlying[0],outlying[0]),('inliers','outliers'),numpoints=1,loc='best')
plt.title("")
plt.show()



In [ ]: